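The data flow for this project requires several input files. The cells below read two of them from the working directory: `doi_list.txt` and `page_views_all.json`.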
In [1]:
import json
import re
import operator
from collections import defaultdict
import pandas as pd
In [2]:
ls
In [3]:
# Input file of page/DOI records; its lines are parsed further down as
# tab-separated fields (page title, then DOI).
doi_list = open('doi_list.txt')
In [4]:
# Read every record into memory, then close the handle: no later visible cell
# reads from doi_list, so leaving it open only leaks the file descriptor.
doi_lines = doi_list.readlines()
doi_list.close()
In [5]:
# Number of raw records (lines) read from the DOI list.
len(doi_lines)
Out[5]:
In [6]:
# Tally of DOI entries per prefix (the text before the first '/').
prefixes = defaultdict(int)
# DOI -> titles of the pages it was listed on.
doi_pages = defaultdict(list)
# Page title -> DOIs listed for that page.
page_dois = defaultdict(list)
In [7]:
# Build the page <-> DOI indexes and the per-prefix tally from the raw lines.
# Each line is split on tabs/newlines: field 0 is the page title, field 1 the
# DOI. The separator pattern is compiled once, outside the loop.
field_sep = re.compile(r'\t|\n')
for line in doi_lines:
    parts = field_sep.split(line)
    if len(parts) < 2:
        # A final line with neither a tab nor a trailing newline has no DOI
        # field at all; skip it instead of raising IndexError.
        continue
    page_title = parts[0]
    doi = parts[1].strip()
    # Ignore empty DOI fields and the 'noedit' placeholder (any letter case).
    if doi and doi.lower() != 'noedit':
        page_dois[page_title].append(doi)
        doi_pages[doi].append(page_title)
        # NOTE(review): the export lost the original indentation; the prefix
        # tally is assumed to belong inside this filter — confirm.
        prefix = doi.split('/')[0]
        prefixes[prefix] += 1
In [8]:
# Sanity check: report any stored DOI that still starts with a space, next to
# its stripped form. DOIs are stripped before being stored, so this is
# expected to print nothing.
# Only the keys are needed, so iterate the dict directly; this also avoids the
# Python-2-only .iteritems().
for doi in doi_pages:
    if doi.startswith(' '):
        print(doi, doi.strip())
In [9]:
# Size of each index entry: DOI entries per page, and page entries per DOI.
# .items() is used instead of the Python-2-only .iteritems() so the cell runs
# unchanged on Python 2 and Python 3.
num_page_dois = {page: len(dois) for page, dois in page_dois.items()}
num_doi_pages = {doi: len(pages) for doi, pages in doi_pages.items()}
In [10]:
# Wrap each tally in a DataFrame with one row per dict key (orient='index').
prefixdf = pd.DataFrame.from_dict(prefixes, orient='index')
ndp = pd.DataFrame.from_dict(num_doi_pages, orient='index')
npd = pd.DataFrame.from_dict(num_page_dois, orient='index')
In [11]:
# Ask pandas to coerce each frame's values to numeric dtypes.
# NOTE(review): convert_objects is a legacy pandas API (deprecated in 0.17) —
# confirm the installed pandas version before upgrading.
npdc, ndpc, prefixdfc = (
    frame.convert_objects(convert_numeric=True)
    for frame in (npd, ndp, prefixdf)
)
In [30]:
# The 20 pages with the most DOI entries, largest first.
# NOTE(review): DataFrame.sort is the pre-0.17 pandas spelling (later replaced
# by sort_values) — confirm the pinned pandas version.
npdc.sort([0], ascending=False).head(20)
Out[30]:
In [12]:
# HTML table of the 10 pages with the most DOI entries.
print(
    npdc.sort([0], ascending=False)
        .head(10)
        .to_html(justify='left')
)
In [13]:
# HTML table of the 11 DOIs listed on the most pages.
print(
    ndpc.sort([0], ascending=False)
        .head(11)
        .to_html(justify='left')
)
In [14]:
# Row labels (the DOI strings) of those same 11 most-listed DOIs.
print(
    ndpc.sort([0], ascending=False)
        .head(11)
        .index
)
In [15]:
# HTML table of the 10 most frequent DOI prefixes.
print(
    prefixdf.sort([0], ascending=False)
            .head(10)
            .to_html()
)
In [16]:
# Load the page-view records. The context manager closes the file as soon as
# it has been parsed (the original left the handle open).
with open('page_views_all.json', 'r') as page_views_file:
    page_views_list = json.load(page_views_file)
# Index the records by their first item (used below as the page title); the
# second item is the value summed as that page's view count.
page_views = {record[0]: record[1] for record in page_views_list}
def total_page_views(page_list, views=None):
    """Return the summed view count of every page in *page_list*.

    Args:
        page_list: iterable of page titles, as UTF-8 byte strings or text.
        views: optional mapping of page title -> view count. Defaults to the
            module-level ``page_views`` table.

    Returns:
        The sum of the counts of the titles found in the table. Titles that
        are missing from it, or that are not valid UTF-8, contribute nothing.
    """
    if views is None:
        views = page_views
    view_count = 0
    for page in page_list:
        try:
            # Byte-string titles are decoded to text before the lookup; text
            # titles are looked up unchanged.
            title = page.decode('utf-8') if isinstance(page, bytes) else page
            view_count += views[title]
        except (KeyError, UnicodeDecodeError):
            # Best-effort total: skip pages without usable view data, but no
            # longer hide unrelated errors behind a bare except.
            continue
    return view_count
# Total views per DOI: the summed views of every page entry recorded for it.
# .items() replaces the Python-2-only .iteritems().
# NOTE(review): a page recorded twice for the same DOI is summed twice —
# confirm doi_list.txt has no duplicate page/DOI rows.
doi_views = {
    doi: total_page_views(page_list)
    for doi, page_list in doi_pages.items()
}
viewsdf = pd.DataFrame.from_dict(data=doi_views, orient='index')
In [18]:
# HTML table of the 10 DOIs with the highest summed page views.
print(
    viewsdf.sort([0], ascending=False)
           .head(10)
           .to_html()
)
In [23]:
%pylab inline
In [29]:
# Summary statistics of the DOI-entries-per-page counts.
npd.describe()
Out[29]:
In [42]:
# Share of pages that have exactly one DOI entry.
float((npdc[0] == 1).sum()) / len(npdc)
Out[42]:
In [35]:
# Histogram of the DOI-entries-per-page counts: 40 bins, log-scaled
# frequency axis.
p = npd.hist(bins=40, log=True)
In [ ]: